package com.charm.tools;

import java.util.Map;
import java.util.HashMap;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.BufferedReader;
import java.io.BufferedWriter;

/**
 * Recomputes the IDF (inverse document frequency) value of every term in the corpus.
 *
 * <p>Input ({@code ./dat/sample.dat}): one document per line; the first
 * tab-separated field of a line holds the document's space-separated terms.
 * Output ({@code ./dat/sample.idf}): one {@code term<TAB>idf} line per distinct term.
 *
 * @author gonglibin
 * 2017.11.28
 */
public class CmIdfConfig {
	/** Total number of documents used as the numerator of the IDF ratio. */
	private static final int CM_DOCS = 1930;
	/** Corpus file that is read. */
	private static final String CM_FSRC = "./dat/sample.dat";
	/** IDF table file that is (over)written. */
	private static final String CM_FDST = "./dat/sample.idf";

	/**
	 * Document-frequency counter for a single term.
	 *
	 * <p>{@link #doc} remembers the document in which the term was last counted,
	 * so repeated occurrences inside that same document are not counted again.
	 */
	public static class CmCnt {
		/** Number of the document in which the term was last counted. */
		public int doc;
		/** Count of documents registered so far (distinct, if documents are fed in order). */
		public int num;

		/**
		 * Creates a counter for a term first seen in document {@code d}; the count starts at 1.
		 *
		 * @param d document number
		 */
		public CmCnt(int d) {
			this(d, 1);
		}

		/**
		 * Creates a counter with an explicit count.
		 *
		 * @param d number of the document last counted
		 * @param n document count
		 */
		public CmCnt(int d, int n) {
			doc = d;
			num = n;
		}

		/**
		 * Registers an occurrence of the term in document {@code d}.
		 *
		 * @param d document number of the occurrence
		 * @return this same instance when {@code d} equals {@link #doc} (already counted),
		 *         otherwise a new counter for document {@code d} with the count increased by one
		 */
		public CmCnt CmCntPlus(int d) {
			return d != doc ? new CmCnt(d, num + 1) : this;
		}
	}

	/**
	 * Reads the corpus, counts in how many documents each term occurs, and writes
	 * {@code log10(CM_DOCS / documentFrequency)} for every term.
	 *
	 * <p>Both files are closed (and the writer flushed) even when an error occurs.
	 * Failures are reported with a stack trace on stderr.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		Map<String, CmCnt> map = new HashMap<>();
		int docs = 0;

		// NOTE: FileReader/FileWriter use the platform default charset on JDKs before 18.
		try (BufferedReader br = new BufferedReader(new FileReader(CM_FSRC));
				BufferedWriter bw = new BufferedWriter(new FileWriter(CM_FDST))) {
			String buf;
			while (null != (buf = br.readLine())) {
				// Document numbers are 1-based: 1, 2, 3, ...
				++docs;

				// Only the first tab-separated field holds terms. indexOf/substring is used
				// instead of split("\t")[0] because split drops trailing empty strings, so a
				// line consisting only of tabs would yield an empty array.
				int tab = buf.indexOf('\t');
				String terms = tab < 0 ? buf : buf.substring(0, tab);

				for (String v : terms.split(" ")) {
					// Blank lines and consecutive spaces produce empty tokens; they are not terms.
					if (v.isEmpty()) {
						continue;
					}
					CmCnt cnt = map.get(v);
					map.put(v, null != cnt ? cnt.CmCntPlus(docs) : new CmCnt(docs));
				}
			}

			// The IDF numerator is the fixed CM_DOCS; make a stale constant visible.
			if (docs != CM_DOCS) {
				System.err.println("warning: " + CM_FSRC + " contains " + docs
						+ " documents but CM_DOCS is " + CM_DOCS);
			}

			for (Map.Entry<String, CmCnt> v : map.entrySet()) {
				// IDF = log10(total documents / document frequency).
				// No +1 smoothing: every term in the map was counted in at least one
				// document, so the divisor is never zero.
				double idf = Math.log10((double) CM_DOCS / v.getValue().num);
				bw.write(v.getKey() + "\t" + idf + "\n");
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}